import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
sns.set_theme(color_codes=True)
pd.set_option('display.max_columns', None)
df = pd.read_excel('Top_1000_companies_DataSet.xlsx')
df.drop_duplicates() #no hay dupliciados
df.dtypes # los formatos/tipos de datos
company_name object url object city object state object country object employees object linkedin_url object founded object Industry object GrowjoRanking object Previous Ranking object estimated_revenues object job_openings object keywords object LeadInvestors object Accelerator object btype object valuation float64 total_funding object product_url object indeed_url object growth_percentage float64 contact_info object dtype: object
Eliminar duplicados
df.drop_duplicates()
| company_name | url | city | state | country | employees | linkedin_url | founded | Industry | GrowjoRanking | Previous Ranking | estimated_revenues | job_openings | keywords | LeadInvestors | Accelerator | btype | valuation | total_funding | product_url | indeed_url | growth_percentage | contact_info | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | OpenAI | openai.com | San Francisco | CA | United States | 655 | http://www.linkedin.com/company/openai | 2015 | AI | 1 | 1 | 88722200 | 25 | Artificial Intelligence, Online Gaming, Non Pr... | Thrive, Founders Fund | NaN | NaN | 2.900000e+10 | $11B | https://www.growjo.com/company/OpenAI | https://www.indeed.com/jobs?q=company%3A(OpenAI) | 1.18 | For Contact Direct Phone Numbers and Emails se... |
| 1 | Alchemy | alchemy.com | San Francisco | CA | United States | 201 | http://www.linkedin.com/company/alchemyinc | NaN | Fintech | 2 | 2 | 39269750 | 1 | NaN | Lightspeed, Silver Lake | NaN | NaN | 1.020000e+10 | $564M | https://www.growjo.com/company/Alchemy | https://www.indeed.com/jobs?q=company%3A(Alchemy) | 1.34 | For Contact Direct Phone Numbers and Emails se... |
| 2 | dbt Labs | getdbt.com | Philadelphia | PA | United States | 511 | http://www.linkedin.com/company/dbtlabs | 2016 | Analytics | 3 | 3 | 76987400 | 1 | NaN | Altimeter | NaN | NaN | 4.200000e+09 | $414.4M | https://www.growjo.com/company/dbt_Labs | https://www.indeed.com/jobs?q=company%3A(dbt L... | 0.76 | For Contact Direct Phone Numbers and Emails se... |
| 3 | Wasabi Technologies | wasabi.com | Boston | MA | United States | 355 | http://www.linkedin.com/company/wasabitechnolo... | 2017 | Cloud | 4 | 4 | 38480000 | 4 | saas | L2 Point | NaN | NaN | 1.100000e+09 | $286.2M | https://www.growjo.com/company/Wasabi_Technolo... | https://www.indeed.com/jobs?q=company%3A(Wasab... | 0.60 | For Contact Direct Phone Numbers and Emails se... |
| 4 | Whatnot | whatnot.com | Los Angeles | CA | United States | 551 | http://www.linkedin.com/company/whatnot-inc | 2019 | eCommerce | 5 | 5 | 39664000 | 1 | NaN | DST Global | NaN | NaN | 3.700000e+09 | $484.7M | https://www.growjo.com/company/Whatnot | https://www.indeed.com/jobs?q=company%3A(Whatnot) | 0.97 | For Contact Direct Phone Numbers and Emails se... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 976 | Forte | forte.io | San Francisco | CA | United States | 145 | http://www.linkedin.com/company/forte-labs-inc | 2018 | Fintech | 992 | 999 | 28369250 | 1 | NaN | NaN | NaN | NaN | 1.000000e+09 | $910M | https://www.growjo.com/company/Forte | https://www.indeed.com/jobs?q=company%3A(Forte) | 0.04 | For Contact Direct Phone Numbers and Emails se... |
| 977 | Collective Health | collectivehealth.com | San Francisco | CA | United States | 615 | http://www.linkedin.com/company/collectivehealth | 2013 | Digital Health | 993 | 1000 | 114858800 | 40 | digital health, healthtech, saas | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 978 | NaN | Google Ventures | NaN | NaN | 1500000000 | $719M | https://www.growjo.com/company/Collective_Health | https://www.indeed.com/jobs?q=company%3A(Colle... | 0.04 | For Contact Direct Phone Numbers and Emails se... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 979 | Fathom (YC W21) | fathom.video | San Francisco | CA | USA | 96 | http://www.linkedin.com/company/fathom-video | 2020 | Tech Services | 999 | 1006 | 14645000 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | https://www.growjo.com/company/Fathom_(YC_W21) | https://www.indeed.com/jobs?q=company%3A(Fatho... | 2.31 | For Contact Direct Phone Numbers and Emails se... |
| 980 | Hone | honehq.com | San Francisco | CA | United States | 179 | http://www.linkedin.com/company/honehq | 2018 | EdTech | 1000 | 1007 | 28294000 | 1 | saas | NaN | NaN | NaN | NaN | $22.4M | https://www.growjo.com/company/Hone | https://www.indeed.com/jobs?q=company%3A(Hone) | 0.60 | For Contact Direct Phone Numbers and Emails se... |
981 rows × 23 columns
Deshacerse ede las fials que contengan datos nan y reamplzar en blanco
df = df.fillna('')
Eliminar las filas que no contengan datos de la tabla principal
for x in df.index:
if df.loc[x, "company_name"] == '':
df.drop(x, inplace=True)
Eliminar todos los datos diferentes que no esten en el abecedario o en los numeros del 1 al 9
df["company_name"] = df["company_name"].str.replace('[^a-zA-Z0-9]',' ')
df["city"] = df["city"].str.replace('[^a-zA-Z0-9]',' ')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\1779931556.py:1: FutureWarning: The default value of regex will change from True to False in a future version.
df["company_name"] = df["company_name"].str.replace('[^a-zA-Z0-9]',' ')
C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\1779931556.py:2: FutureWarning: The default value of regex will change from True to False in a future version.
df["city"] = df["city"].str.replace('[^a-zA-Z0-9]',' ')
Revisar la informacion de los datos
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 965 entries, 0 to 980 Data columns (total 23 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 company_name 965 non-null object 1 url 965 non-null object 2 city 965 non-null object 3 state 965 non-null object 4 country 965 non-null object 5 employees 965 non-null object 6 linkedin_url 965 non-null object 7 founded 965 non-null object 8 Industry 965 non-null object 9 GrowjoRanking 965 non-null object 10 Previous Ranking 965 non-null object 11 estimated_revenues 965 non-null object 12 job_openings 965 non-null object 13 keywords 965 non-null object 14 LeadInvestors 965 non-null object 15 Accelerator 965 non-null object 16 btype 965 non-null object 17 valuation 965 non-null object 18 total_funding 965 non-null object 19 product_url 965 non-null object 20 indeed_url 965 non-null object 21 growth_percentage 965 non-null object 22 contact_info 965 non-null object dtypes: object(23) memory usage: 213.2+ KB
Eliminar las columnas que no necesitamos
columnas_a_eliminar = ["contact_info","product_url","indeed_url","Accelerator","btype","keywords","linkedin_url","url","Previous Ranking"]
df = df.drop(columns = columnas_a_eliminar)
df = df.drop(680)
df = df.drop(688)
df = df.drop(738)
df = df.drop(740)
Resetear los index porque se acabana de eliminar el index 680
df = df.reset_index(drop=True)
convertir el formato de algunas columnas
Principalmente convertirlas en type str excepto GrowjoRanking que ya es de type int →↓
para poder manipularlas y aplicarles filtros en s
df["employees"] = df["employees"].apply(lambda x: str(x))
df["founded"] = df["founded"].apply(lambda x: str(x))
df["estimated_revenues"] = df["estimated_revenues"].apply(lambda x: str(x))
df["job_openings"] = df["job_openings"].apply(lambda x: str(x))
df["growth_percentage"] = df["growth_percentage"].apply(lambda x: str(x))
Reemplazar valores desconocidos por los valores 0 para int y desconocido para str
df["founded"] = df["founded"].replace('','0')
df["state"] = df["state"].replace('','desconocido')
df["city"] = df["city"].replace('','desconocido')
df["country"] = df["country"].replace('','desconocido')
df["Industry"] = df["Industry"].replace('','desconocido')
df["estimated_revenues"] = df["estimated_revenues"].replace('','0')
df["job_openings"] = df["job_openings"].replace('','0')
df["LeadInvestors"] = df["LeadInvestors"].replace('','desconocido')
df["valuation"] = df["valuation"].replace('','0')
df["total_funding"] = df["total_funding"].replace('','0')
df["growth_percentage"] = df["growth_percentage"].replace('','0')
Convertir las columnas de numeros a enteros
df["employees"] = df["employees"].apply(lambda x: int(x))
df["GrowjoRanking"] = df["GrowjoRanking"].apply(lambda x: int(x))
df["founded"] = df["founded"].apply(lambda x: int(x))
df["valuation"] = df["valuation"].apply(lambda x: int(x))
df["estimated_revenues"] = df["estimated_revenues"].apply(lambda x: float(x))
df["growth_percentage"] = df["growth_percentage"].apply(lambda x: float(x))
df["job_openings"] = df["job_openings"].apply(lambda x: int(x))
Eliminar los caracteres extraños en la columna de tota_funding
df['total_funding'] = df['total_funding'].str.replace(r'[^0-9MmBb.]', '', regex=True)
Necesitamos convertir la columna total_funding en una columna que solo contenga numeros
df['total_funding'] = df['total_funding'].replace({'M': 'e6', 'B': 'e9'}, regex=True)
df['total_funding'] = pd.to_numeric(df['total_funding'], errors='coerce')
df["total_funding"] = df["total_funding"].apply(lambda x: int(x))
Este codigo es preferencial para convertir los datos limpios en xlsx
output_file = "top_companies_cleaning.xlsx"
df.to_excel(output_file, index=False)
df.select_dtypes(include='object').nunique()
company_name 942 city 327 state 54 country 61 Industry 120 LeadInvestors 356 dtype: int64
Neceitamos convertir los paises a regiones y para eso debemos reemplazar los nomnbres de algunos paises
df["country"] = df["country"].replace('AUS','Australia')
df["country"] = df["country"].replace('Aus','Australia')
df["country"] = df["country"].replace('CAN','Canada')
df["country"] = df["country"].replace('GEN','Germany')
df["country"] = df["country"].replace('Ger','Germany')
df["country"] = df["country"].replace('IE','Irelanda')
df["country"] = df["country"].replace('Ind','India')
df["country"] = df["country"].replace('Netharlands','Netherlands')
df["country"] = df["country"].replace('NO','Norway')
df["country"] = df["country"].replace('NOR','Norway')
df["country"] = df["country"].replace('POL','Polonia')
df["country"] = df["country"].replace('SGP','Singapore')
df["country"] = df["country"].replace('SWE','Sweden')
df["country"] = df["country"].replace('USA','United States')
df["country"] = df["country"].replace('United State','United States')
Convertir los paises en regiones
def segment_country(country):
if country in ["China", "Hong Kong", "India", "Indonesia", "Israel", "Japan", "Kuwait", "Pakistan", "Singapore", "South Korea", "Taiwan", "Thailand", "Turkey", "United Arab Emirates"]:
return 'Asia'
elif country in ["Austria", "Belgium", "Cyprus", "Estonia", "Finland", "France", "Germany", "Ireland", "Irelanda", "Italy", "Liechtenstein", "Lithuania", "Netherlands", "Norway", "Poland", "Romania", "Spain", "Sweden", "Switzerland", "UK"]:
return 'Europe'
elif country in ["Canada", "Mexico", "Panama", "United States"]:
return 'North America'
elif country in ["Brazil", "Colombia", "Ecuador"]:
return 'South America'
elif country in ["Egypt", "Kenya", "Namibia", "Seychelles", "South Africa"]:
return 'Africa'
elif country in ["Australia", "New Zealand"]:
return 'Oceania'
else:
return 'Others'
Aplicar la función de segmentación para crear la nueva columna
df['region'] = df['country'].apply(segment_country)
# Agregar una columna de años para hacer graficos de barras por fechas
df['año'] = range(1800, 1800 + len(df))
Graficar Regiones
plt.figure(figsize=(10,5))
df['region'].value_counts().plot(kind='bar')
<Axes: >
#Obtener los nombres de todas las columnas con el tipo de dato objetos (catagorical num)
cat_vars = df.select_dtypes(include='object').columns.tolist()
# Crear los espacios para las gráficas
num_cols = len(cat_vars)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()
# Crear un contador de gráficas para los 5 primeros valores de cada variable categórica usando seaborn
for i, var in enumerate(cat_vars):
# Excluir 'desconocido' de las columnas leadinvestors y state
if var in ['LeadInvestors', 'state']:
top_values = df[df[var] != 'desconocido'][var].value_counts().nlargest(10).index
filtered_df = df[df[var].isin(top_values)]
else:
top_values = df[var].value_counts().nlargest(10).index
filtered_df = df[df[var].isin(top_values)]
sns.countplot(x=var, data=filtered_df, ax=axs[i])
axs[i].set_title(var)
axs[i].tick_params(axis='x', rotation=90)
# Eliminar cada espacio extra en los gráficos
if num_cols < len(axs):
for i in range(num_cols, len(axs)):
fig.delaxes(axs[i])
# Ajustar los espacios entre las gráficas
fig.tight_layout()
plt.show()
#conteo_valores = df['LeadInvestors'].value_counts()
#Escoger los nombres de las columnas con los datos tipo 'int' o 'float'
num_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()
#Crear un espacio para cada grafica
num_cols = len(num_vars)
num_rows = (num_cols + 2) //3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()
#Crear un boxplot para cada variable númerica usando Seaborn
for i, var in enumerate(num_vars):
sns.boxplot(x=df[var], ax=axs[i])
axs[i].set_title(var)
#Eliminar los expacios extras que no se graficaron
if num_cols < len(axs):
for i in range(num_cols, len(axs)):
fig.delaxes(axs[i])
#Ajustar los espacios entre las graficas
fig.tight_layout()
plt.show()
# Escogemos los nombre de todas las columnas con los datos 'int' "Numeros enteros"
int_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()
#Crear las figuras/espacios para los graficos
num_cols = len(int_vars)
num_rows = (num_cols + 2)// 3 # Asegurate que son los espacios suficientes para todas las graficas
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(20, 5*num_rows))
axs = axs.flatten()
#crear un box-plot para cada varibles usando seaborn con hue='attritio'
for i, var in enumerate(int_vars):
sns.boxplot(y=var, x='region', data=df , ax=axs[i])
axs[i].set_title(var)
# Eliminar cada espacio extra que o hayan llenado los graficos
if num_cols < len(axs):
for i in range(num_cols, len(axs)):
fig.delaxes(axs[i])
# Ajustar los espacios de os graficos y los titulos
fig.tight_layout()
plt.show()
#Escoger los nombres de todas las columnas que contengan datos 'int' y 'float'
int_vars = df.select_dtypes(include=['int','float']).columns.tolist()
#Crear los esopacios para las graficas
num_cols = len (int_vars)
num_rows = (num_cols + 2) //3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()
# Crear un histograma por cada variable entero}
for i, var in enumerate(int_vars):
df[var].plot.hist(ax=axs[i])
axs[i].set_title(var)
# Elimnar los espacios extras y dejar solo los que necesitamos
if num_cols < len(axs):
for i in range(num_cols, len(axs)):
fig.delaxes(axs[i])
# Ajustar los espacios entre las graficas
fig.tight_layout()
plt.show()
#Obtener los nombres de toidas las comlumnas de tipo 'int' (Entero)
int_vars = df.select_dtypes(include=['int', 'float']).columns.tolist()
#Crear una fuigura con los espacios de las graficas
num_cols = len(int_vars)
num_rows = (num_cols + 2) // 3 # To make sure there are enough rows for the subplots
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(15, 5*num_rows))
axs = axs.flatten()
#Crear un histograma para cada variable con hue='Attrition'
for i, var in enumerate(int_vars):
sns.histplot(data=df, x=var, hue='region', kde=True, ax=axs[i])
axs[i].set_title(var)
# Eliminar los espacios de graficos que no se necesitan
if num_cols < len(axs):
for i in range(num_cols, len(axs)):
fig.delaxes(axs[i])
# Ajustar los espacios entre las graficas
fig.tight_layout()
plt.show()
#Especificar el número maximo de de categorías a mostrar individualmente
max_categories = 7
# Filtrar las columnas categoricas con tip 'objeto'
cat_cols = [col for col in df.columns if df[col].dtype == 'object']
# Crear los espacios de los graficos
num_cols = len(cat_cols)
num_rows = (num_cols + 2) // 3
fig, axs = plt.subplots(nrows=num_rows, ncols=3, figsize=(30, 7*num_rows))
# Aplana la matriz axs para facilitar la indexación
axs = axs.flatten()
# Crear la torta para cada columna categorica
for i, col in enumerate(cat_cols):
if i < len(axs): # Ensure we don't exceed the number of subplots
#Count the number of occurrences for each category
cat_counts = df[col].value_counts()
# Categorías de grupo más allá de max_categories superiores como 'Otros'
if len(cat_counts) > max_categories:
cat_counts_top = cat_counts[:max_categories]
cat_counts_other = pd.Series(cat_counts[max_categories:].sum(), index=["Other"])
cat_counts = cat_counts_top.append(cat_counts_other)
# Crear una torta
axs[i].pie(cat_counts, labels=cat_counts.index, autopct='%1.1f%%', startangle=90)
axs[i].set_title(f'{col} region')
#eliminar cada espacio extra
if num_cols < len(axs):
for i in range(num_cols, len(axs)):
fig.delaxes(axs[i])
# Ajusta el espacio entre las graficas
fig.tight_layout()
plt.show()
C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\13778241.py:25: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. cat_counts = cat_counts_top.append(cat_counts_other) C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\13778241.py:25: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. cat_counts = cat_counts_top.append(cat_counts_other) C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\13778241.py:25: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. cat_counts = cat_counts_top.append(cat_counts_other) C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\13778241.py:25: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. cat_counts = cat_counts_top.append(cat_counts_other) C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\13778241.py:25: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. cat_counts = cat_counts_top.append(cat_counts_other) C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\13778241.py:25: FutureWarning: The series.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead. cat_counts = cat_counts_top.append(cat_counts_other)
df[{'founded','Industry','company_name'}]
graph=px.line(df,x='founded',y='Industry',color='founded',title='Industry',range_x=[1854,2022])
graph.show()
df[{'founded','region','company_name'}]
graph=px.line(df,x='founded',y='region',color='founded',title='region',range_x=[1854,2022])
graph.show()
C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\2205364330.py:1: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
df[{'founded','Industry','company_name'}]
C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\2205364330.py:5: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
Grafico de torta por industria
df_torta = df[{'region','estimated_revenues'}]
fig=px.pie(df_torta,values='estimated_revenues',color='region',names='region',labels='region',width=800,height=600,hole=0.2)
fig.show()
C:\Users\ASUS\AppData\Local\Temp\ipykernel_21616\2421618612.py:1: FutureWarning: Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.
# Revisar la cantidad de los valores perdidos
revisar_datosperdidos = df.isnull().sum()* 100 / df.shape[0] #Este codigo muestra el porcentaje de los datos perdidos
revisar_datosperdidos[revisar_datosperdidos > 0].sort_values(ascending=False)
Series([], dtype: float64)
# Como en este dataset todos los datos están completos se dejan quietos
# En el caso coontrario donde hubiesen datos nulos mayores al 30% se eliminan df.drop(columns:"")
# cuando dos columnas se encuentren con la misma cantidad de datos null y sean >30% se procede a rellenar con el promedio
# df["columna_x"].fillna(df["columna_x"].mean(, inplace=True))
# df["columna_y"].fillna(df["columna_y"].mean(, inplace=True))
for col in df.select_dtypes(include=['object']).columns:
#Imprimir el nombre de las columnas y los valores unicos
print(f"{col}: {df[col].unique()}")
from sklearn import preprocessing
# recorra cada columna en el marco de datos donde dtype es 'objeto'
for col in df.select_dtypes(include=['object']).columns:
#inicializar un objeto codificador de etiquetas
label_encoder = preprocessing.LabelEncoder()
#ajustar el codificador a los valores únicos en la columna
label_encoder.fit(df[col].unique())
#Transforma la columna usando el codificador
df[col] = label_encoder.transform(df[col])
#imprimir el nombre de la columna y los valores codificados únicos
print(f"{col}: {df[col].unique()}")
# Correlation heatmap
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(), fmt='.2g', annot=False)
<Axes: >